- Explore your dataset, graphically
- Find relationship between variables
- Find differences between groups
13 February 2018
library("tidyverse")
## Loading tidyverse: ggplot2 ## Loading tidyverse: tibble ## Loading tidyverse: tidyr ## Loading tidyverse: readr ## Loading tidyverse: purrr ## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats ## lag(): dplyr, stats
alt text
Figure from http://r4ds.had.co.nz/tidy-data.html
Following three rules makes a dataset tidy: variables are in columns, observations are in rows, and values are in cells.
# Read the root-architecture CSV into a tibble; column types are guessed by
# read_csv() and reported in the message that follows.
mydata <- read_csv("datasets/architect.csv")
## Parsed with column specification: ## cols( ## .default = col_double(), ## X1 = col_integer(), ## FileName = col_character(), ## genotype = col_character(), ## Time = col_integer(), ## TNLR = col_integer(), ## N2LR = col_integer(), ## Magnitude = col_integer(), ## Altitude = col_integer(), ## ExtPathLength = col_integer() ## )
## See spec(...) for full column specifications.
# Print the structure of the dataset: dimensions, column names and types.
str(mydata)
## Classes 'tbl_df', 'tbl' and 'data.frame': 647 obs. of 20 variables: ## $ X1 : int 1 2 3 4 5 6 7 8 9 10 ... ## $ FileName : chr "dense-3-1-12_1" "dense-3-1-12_1" "dense-3-1-12_1" "dense-3-1-12_1" ... ## $ genotype : chr "dense" "dense" "dense" "dense" ... ## $ Time : int 4 5 6 7 8 9 10 11 12 3 ... ## $ TRL : num 118 236 591 1594 3661 ... ## $ GRTR : num 118 118 354 1004 2067 ... ## $ L1R : num 118 236 354 472 591 ... ## $ GR1R : num 118 118 118 118 118 ... ## $ TNLR : int 0 3 13 22 29 36 44 54 54 0 ... ## $ TLRL : num 0.00 5.72e-05 2.36e+02 1.12e+03 3.07e+03 ... ## $ N2LR : int 0 3 13 22 29 36 44 54 54 0 ... ## $ L2LR : num 0.00 5.72e-05 2.36e+02 1.12e+03 3.07e+03 ... ## $ ML2LR : num 0.00 1.91e-05 1.82e+01 5.10e+01 1.06e+02 ... ## $ GR2L : num 0.00 5.72e-05 2.36e+02 8.86e+02 1.95e+03 ... ## $ D2LR : num 0 0.0127 0.0367 0.0466 0.0491 ... ## $ Height : num 116 231 348 465 581 ... ## $ Width : num 20.6 43.9 107 168.8 233 ... ## $ Magnitude : int 1 4 14 23 30 37 45 55 55 1 ... ## $ Altitude : int 1 4 14 23 30 37 45 55 55 1 ... ## $ ExtPathLength: int 1 13 118 298 494 739 1079 1594 1594 1 ... ## - attr(*, "spec")=List of 2 ## ..$ cols :List of 20 ## .. ..$ X1 : list() ## .. .. ..- attr(*, "class")= chr "collector_integer" "collector" ## .. ..$ FileName : list() ## .. .. ..- attr(*, "class")= chr "collector_character" "collector" ## .. ..$ genotype : list() ## .. .. ..- attr(*, "class")= chr "collector_character" "collector" ## .. ..$ Time : list() ## .. .. ..- attr(*, "class")= chr "collector_integer" "collector" ## .. ..$ TRL : list() ## .. .. ..- attr(*, "class")= chr "collector_double" "collector" ## .. ..$ GRTR : list() ## .. .. ..- attr(*, "class")= chr "collector_double" "collector" ## .. ..$ L1R : list() ## .. .. ..- attr(*, "class")= chr "collector_double" "collector" ## .. ..$ GR1R : list() ## .. .. ..- attr(*, "class")= chr "collector_double" "collector" ## .. ..$ TNLR : list() ## .. .. ..- attr(*, "class")= chr "collector_integer" "collector" ## .. 
..$ TLRL : list() ## .. .. ..- attr(*, "class")= chr "collector_double" "collector" ## .. ..$ N2LR : list() ## .. .. ..- attr(*, "class")= chr "collector_integer" "collector" ## .. ..$ L2LR : list() ## .. .. ..- attr(*, "class")= chr "collector_double" "collector" ## .. ..$ ML2LR : list() ## .. .. ..- attr(*, "class")= chr "collector_double" "collector" ## .. ..$ GR2L : list() ## .. .. ..- attr(*, "class")= chr "collector_double" "collector" ## .. ..$ D2LR : list() ## .. .. ..- attr(*, "class")= chr "collector_double" "collector" ## .. ..$ Height : list() ## .. .. ..- attr(*, "class")= chr "collector_double" "collector" ## .. ..$ Width : list() ## .. .. ..- attr(*, "class")= chr "collector_double" "collector" ## .. ..$ Magnitude : list() ## .. .. ..- attr(*, "class")= chr "collector_integer" "collector" ## .. ..$ Altitude : list() ## .. .. ..- attr(*, "class")= chr "collector_integer" "collector" ## .. ..$ ExtPathLength: list() ## .. .. ..- attr(*, "class")= chr "collector_integer" "collector" ## ..$ default: list() ## .. ..- attr(*, "class")= chr "collector_guess" "collector" ## ..- attr(*, "class")= chr "col_spec"
Use dplyr to filter data based on specific values. %>% is called a pipe and allows you to queue up operations.
# Keep only the rows whose genotype is "dense", then preview the first rows.
mydata %>%
  filter(genotype == "dense") %>%
  head()
## # A tibble: 6 × 20 ## X1 FileName genotype Time TRL GRTR L1R ## <int> <chr> <chr> <int> <dbl> <dbl> <dbl> ## 1 1 dense-3-1-12_1 dense 4 118.1103 118.1103 118.1103 ## 2 2 dense-3-1-12_1 dense 5 236.2205 118.1103 236.2205 ## 3 3 dense-3-1-12_1 dense 6 590.5513 354.3308 354.3307 ## 4 4 dense-3-1-12_1 dense 7 1594.4887 1003.9373 472.4410 ## 5 5 dense-3-1-12_1 dense 8 3661.4180 2066.9294 590.5511 ## 6 6 dense-3-1-12_1 dense 9 6259.8433 2598.4253 708.6613 ## # ... with 13 more variables: GR1R <dbl>, TNLR <int>, TLRL <dbl>, ## # N2LR <int>, L2LR <dbl>, ML2LR <dbl>, GR2L <dbl>, D2LR <dbl>, ## # Height <dbl>, Width <dbl>, Magnitude <int>, Altitude <int>, ## # ExtPathLength <int>
You can also select, or drop specific columns using the select verb.
# Keep five columns of interest and preview the first rows.
mydata %>%
  select(FileName, genotype, Time, Height, Width) %>%
  head()
## # A tibble: 6 × 5 ## FileName genotype Time Height Width ## <chr> <chr> <int> <dbl> <dbl> ## 1 dense-3-1-12_1 dense 4 115.9124 20.61023 ## 2 dense-3-1-12_1 dense 5 231.4529 43.94336 ## 3 dense-3-1-12_1 dense 6 348.0965 106.99701 ## 4 dense-3-1-12_1 dense 7 465.2197 168.76273 ## 5 dense-3-1-12_1 dense 8 581.1602 232.97046 ## 6 dense-3-1-12_1 dense 9 698.4634 289.51532
You can also select, or drop specific columns using the select verb.
# Drop the X1 column for the rest of the session.
mydata <- mydata %>%
  select(-X1)

# Preview the trimmed dataset.
mydata %>%
  head()
## # A tibble: 6 × 19 ## FileName genotype Time TRL GRTR L1R GR1R ## <chr> <chr> <int> <dbl> <dbl> <dbl> <dbl> ## 1 dense-3-1-12_1 dense 4 118.1103 118.1103 118.1103 118.1103 ## 2 dense-3-1-12_1 dense 5 236.2205 118.1103 236.2205 118.1102 ## 3 dense-3-1-12_1 dense 6 590.5513 354.3308 354.3307 118.1103 ## 4 dense-3-1-12_1 dense 7 1594.4887 1003.9373 472.4410 118.1102 ## 5 dense-3-1-12_1 dense 8 3661.4180 2066.9294 590.5511 118.1102 ## 6 dense-3-1-12_1 dense 9 6259.8433 2598.4253 708.6613 118.1102 ## # ... with 12 more variables: TNLR <int>, TLRL <dbl>, N2LR <int>, ## # L2LR <dbl>, ML2LR <dbl>, GR2L <dbl>, D2LR <dbl>, Height <dbl>, ## # Width <dbl>, Magnitude <int>, Altitude <int>, ExtPathLength <int>
And you can create new variables using the mutate verb.
# Add newvar = log(TRL), keep three columns, and preview the first rows.
mydata %>%
  mutate(newvar = log(TRL)) %>%
  select(genotype, Time, newvar) %>%
  head()
## # A tibble: 6 × 3 ## genotype Time newvar ## <chr> <int> <dbl> ## 1 dense 4 4.771619 ## 2 dense 5 5.464766 ## 3 dense 6 6.381057 ## 4 dense 7 7.374308 ## 5 dense 8 8.205606 ## 6 dense 9 8.741910
alt text
ggplot?
Used to produce statistical graphics; main developer: Hadley Wickham
"an attempt to take the good things about base and lattice graphics and improve on them with a strong, underlying model"
based on The Grammar of Graphics by Leland Wilkinson, 2005
describes the meaning of what we do when we construct statistical graphics … More than a taxonomy … Computational system based on the underlying mathematics of representing statistical functions of data.
ggplot components
- data: in ggplot2, data must be stored as an R data frame
- coordinate system: describes the 2-D space that data is projected onto
- geoms: describe the type of geometric objects that represent data
- aesthetics: describe the visual characteristics that represent data
- scales: for each aesthetic, describe how the visual characteristic is converted to display values
- stats: describe statistical transformations that typically summarize data
- facets: describe how data is split into subsets and displayed as multiple small graphs
data and aesthetic
We first create the plot by setting the data and the aesthetic.
# Base plot: TRL on the x axis, TNLR on the y axis. No geom yet, so printing
# it shows only the empty panel.
myplot <- ggplot(mydata, aes(TRL, TNLR))
myplot
geometry - Points
We need to add a geom to display the plot. Different geoms can be used.
# Add a point layer to the base plot.
myplot + geom_point()
geometry - Lines
We need to add a geom to display the plot. Different geoms can be used.
# Add a line layer to the base plot.
myplot + geom_line()
geometry - Steps
We need to add a geom to display the plot. Different geoms can be used.
# Add a step layer to the base plot.
myplot + geom_step()
geoms
The advantage of using a layered approach is that the layers can be combined. For instance, several geoms can be used in the same plot.
# Combine two layers on the same base plot: points and steps.
myplot + geom_point() + geom_step()
We can remove the chart junk by using alternative themes
# Point plot rendered with theme_bw() instead of the default theme.
myplot + geom_point() + theme_bw()
We can remove the chart junk by using alternative themes
# Point plot rendered with theme_classic() instead of the default theme.
myplot + geom_point() + theme_classic()
We can remove the chart junk by using alternative themes
# Point plot rendered with theme_minimal() instead of the default theme.
myplot + geom_point() + theme_minimal()
Let's use the different categories we have. For this, we add a colour argument in the aes. ggplot will automatically pick a discrete color scale.
# Point plot of TNLR against TRL, with colour mapped to genotype.
ggplot(mydata, aes(TRL, TNLR, colour = genotype)) +
  theme_bw() +
  geom_point()
Let's use one of the variables as a continuous category. For this, we add a colour argument in the aes. ggplot will automatically pick a continuous color scale.
# Point plot of TNLR against TRL, with colour mapped to Height.
ggplot(mydata, aes(TRL, TNLR, colour = Height)) +
  geom_point()
When using multiple categories (here, Time and genotype), we can define both different colors (colour) and different point styles (shape) in the aes argument.
# Point plot of TNLR against TRL: colour mapped to Time, point shape to
# genotype.
ggplot(mydata, aes(TRL, TNLR, colour = Time, shape = genotype)) +
  geom_point()
facets
Facets can be used to split the data and present the subsets side by side.
# Same coloured point plot, split into one panel per genotype.
ggplot(mydata, aes(TRL, TNLR, colour = genotype)) +
  theme_bw() +
  geom_point() +
  facet_wrap(~genotype)
Adding stats to the plots
ggplot has some built-in stat functions that can be directly used in the plots.
# Coloured point plot with a smoothed trend per genotype (default settings).
ggplot(mydata, aes(TRL, TNLR, colour = genotype)) +
  theme_bw() +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'loess'
Adding stats to the plots
ggplot has some built-in stat functions that can be directly used in the plots.
# Same smoothed plot, with se = FALSE passed to geom_smooth().
ggplot(mydata, aes(TRL, TNLR, colour = genotype)) +
  theme_bw() +
  geom_point() +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess'
Adding stats to the plots
ggplot has some built-in stat functions that can be directly used in the plots.
# Same plot, with the smoother switched to method = "lm" and se = FALSE.
ggplot(mydata, aes(TRL, TNLR, colour = genotype)) +
  theme_bw() +
  geom_point() +
  geom_smooth(se = FALSE, method = "lm")
Adding stats to the plots
ggplot has some built-in stat functions that can be directly used in the plots.
# Coloured point plot with a stat_ellipse() layer per genotype.
ggplot(mydata, aes(TRL, TNLR, colour = genotype)) +
  theme_bw() +
  geom_point() +
  stat_ellipse()
# Box plot of TRL for each genotype.
ggplot(mydata, aes(genotype, TRL)) +
  theme_bw() +
  geom_boxplot()
Now, we can also try to see all the data at once. For this, we need to reshape the initial data from a wide to a long format. We also need to normalize each variable of the dataset, so that we can visually compare them.
# Normalisation function
#
# Min-max rescaling of a numeric vector onto [0, 1]:
#   (m - min(m)) / (max(m) - min(m))
#
# Arguments:
#   m     numeric vector to rescale.
#   na.rm if TRUE (default), missing values are ignored when computing the
#         range and stay NA in the result; if FALSE, any NA in `m` makes the
#         whole result NA.
#
# Returns a numeric vector of the same length as `m`.
normalit <- function(m, na.rm = TRUE) {
  # Nothing to scale: empty input or only missing values.
  if (length(m) == 0L || all(is.na(m))) {
    return(rep(NA_real_, length(m)))
  }

  rng <- range(m, na.rm = na.rm)
  span <- rng[2] - rng[1]

  # The range is undefined (an NA was kept because na.rm = FALSE).
  if (is.na(span)) {
    return(rep(NA_real_, length(m)))
  }

  # Constant vector: avoid 0 / 0 = NaN and map every value to 0 instead.
  if (span == 0) {
    return(m - rng[1])
  }

  (m - rng[1]) / span
}
Now, we can also try to see all the data at once. For this, we need to reshape the initial data from a wide to a long format. We also need to normalize each variable of the dataset, so that we can visually compare them.
# Build the long-format, normalised dataset used by the profile/radar plots.
#  1. Apply normalit() to every numeric column (this includes Time).
#  2. Stack the columns TRL through ExtPathLength into two columns:
#     `variable` (the former column name) and `value`.
# mutate_if() replaces the deprecated mutate_each(funs(...)) combination;
# gather() is kept so that the row order of the result is unchanged.
mydatalong <- mydata %>%
  mutate_if(is.numeric, normalit) %>% # Normalize each numeric column
  gather(TRL:ExtPathLength, key = "variable", value = "value")

# Preview the reshaped data.
mydatalong %>%
  head()
## # A tibble: 6 × 5 ## FileName genotype Time variable value ## <chr> <chr> <dbl> <chr> <dbl> ## 1 dense-3-1-12_1 dense 0.2 TRL 0.002538073 ## 2 dense-3-1-12_1 dense 0.3 TRL 0.007614216 ## 3 dense-3-1-12_1 dense 0.4 TRL 0.022842647 ## 4 dense-3-1-12_1 dense 0.5 TRL 0.065989868 ## 5 dense-3-1-12_1 dense 0.6 TRL 0.154822364 ## 6 dense-3-1-12_1 dense 0.7 TRL 0.266497495
# Make a line plot, for each variable
mydatalong %>%
  filter(Time == max(Time)) %>% # Select just one time point (the last)
  ggplot(aes(variable, value, group = FileName, colour = genotype)) +
  geom_line() +
  facet_wrap(~genotype, nrow = 2) +
  theme_classic() +
  theme(text = element_text(size = 9))
# Same per-file profile lines, drawn in polar coordinates.
mydatalong %>%
  filter(Time == max(Time)) %>% # Select just one time point (the last)
  ggplot(aes(variable, value, group = FileName, colour = genotype)) +
  geom_line() +
  coord_polar() + # Change the coordinate system
  facet_wrap(~genotype, nrow = 2) +
  theme_classic() +
  theme(text = element_text(size = 9))
# Radar plot: unfilled polygons in polar coordinates, one panel per genotype.
mydatalong %>%
  filter(Time == max(Time)) %>% # Select just one time point (the last)
  ggplot(aes(variable, value, group = FileName, colour = genotype)) +
  geom_polygon(fill = NA) + # Close the radar plot
  coord_polar() + # Change the coordinate system
  facet_wrap(~genotype, nrow = 2) +
  theme_classic() +
  theme(text = element_text(size = 9))
library(gganimate)

# Animated radar plot: the same polygons as the static radar plot, built from
# every time point, with Time mapped to the `frame` aesthetic.
# NOTE(review): the `frame` aesthetic together with gganimate(pl) looks like
# the early gganimate interface; later releases appear to use
# transition_*() / animate() instead -- confirm against the installed version.
pl <- mydatalong %>%
  ggplot(
    aes(variable, value, group = FileName, colour = genotype, frame = Time)
  ) +
  geom_polygon(fill = NA) + # Close the radar plot
  coord_polar() + # Change the coordinate system
  facet_wrap(~genotype, nrow = 2) +
  theme_classic() +
  theme(text = element_text(size = 9))

gganimate(pl)
plotly library
Plotly creates leading open source tools for composing, editing, and sharing interactive data visualizations via the Web.
library("plotly")
## ## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2': ## ## last_plot
## The following object is masked from 'package:stats': ## ## filter
## The following object is masked from 'package:graphics': ## ## layout
plotly works great with ggplot
Any type of ggplot can be wrapped up in a plotly figure.
# Build the coloured point plot, then hand it to ggplotly().
pl <- ggplot(mydata, aes(TRL, TNLR, colour = genotype)) +
  theme_bw() +
  geom_point()
ggplotly(pl)
plotly works great with ggplot
We can choose what type of label we want to see.
# Same point plot with FileName mapped to the `label` aesthetic, then handed
# to ggplotly().
pl <- ggplot(mydata, aes(TRL, TNLR, colour = genotype, label = FileName)) +
  theme_bw() +
  geom_point()
ggplotly(pl)
plotly works great with ggplot
pl <- mydata %>% filter(Time == max(Time)) %>% ggplot(aes(x=genotype, y=TRL, label=FileName)) + geom_boxplot()
ggplotly(pl )
Google is your friend !
This presentation: http://bit.ly/ggplot-pres
http://seananderson.ca/ggplot2-FISH554/
http://blog.echen.me/2012/01/17/quick-introduction-to-ggplot2/
http://tutorials.iq.harvard.edu/R/Rgraphics/Rgraphics.html
http://zevross.com/blog/2014/08/04/beautiful-plotting-in-r-a-ggplot2-cheatsheet-3